Preliminaries

First we load some packages – two of them, concordances and collostructions, are not on CRAN and can therefore not be installed via install.packages. collostructions is available on Susanne Flach’s website, concordances is available on Github and can be installed using devtools::install_github (package devtools needs to be installed; if it is not installed yet, you can install it from CRAN.)

# install concordances package (if not yet installed)
if(!is.element("concordances", installed.packages())) {
  devtools::install_github("hartmast/concordances")
}

# load packages
library(collostructions) # available at sfla.ch
library(concordances)
library(tidyverse)
library(data.table)
library(ggraph)
library(igraph)
library(networkD3)
library(DT)
library(readxl)
library(vroom)

Queries

We used DECOW to search for quantifier/degree-modifier constructions. The following queries were used (followed by the number of matches):

# list of files
f <- list.files(pattern = "xml")

# get queries from concordance file:
sapply(1:length(f), function(i) trimws(gsub("<query>|</query>", "", readLines(f[i], n = 7)[6])))
##  [1] "[word=\"ein(e[nm])?\"] [word=\"[F]ünkchen\"] [tag=\"N.*|ADJ.*|V.*\"] 2948"        
##  [2] "[word=\"ein(e[nm])?\"] [word=\"[F]ünkchen\"] \"zu\" [] 10"                        
##  [3] "[word=\"[eE]in(e[nm])?\"] [word=\"[Qq]u[eä]ntchen\"] [tag=\"N.*|ADJ.*|V.*\"] 3423"
##  [4] "[word=\"[Ee]in(e[mn])?\"] [word=\"[Hh]auch\"] [tag=\"N.*|ADJ.*\"] 16716"          
##  [5] "[word=\"[Ee]in(e[mn])?\"] [word=\"[Hh]auch\"] \"zu\" [] 781"                      
##  [6] "[word=\"[eE]in(e[nm])?\"] [word=\"[Qq]u[eä]ntchen\"] \"zu\" [] 76"                
##  [7] "[word=\"[Ee]in(e[nm])?\"] [word=\"[ZTzt]acken\"] [tag=\"N.*|ADJ.*\"] 1830"        
##  [8] "[word=\"[Ee]in(e[nm])?\"] [word=\"[ZTzt]acken\"] \"zu\" [] 392"                   
##  [9] "[word=\"[Ee]in(e[mn])?\"] [word=\"[Tt]ick\"] [tag=\"ADJ.*|N.*\"] 17707"           
## [10] "[word=\"[Ee]in(e[nm])?\"] [word=\"[Tt]ick\"] [word=\"zu\"] [] 6032"               
## [11] "[word=\"[Ee]iner?\"] [word=\"[Hh]andvoll\"] [tag=\"N.*\"] 35998"                  
## [12] "[word=\"[Ee]iner?\"] [word=\"[Ii]dee\"] [tag=\"N.*|ADJ.*\"] 3900"                 
## [13] "[word=\"[Ee]iner?\"] [word=\"[Id]dee\"] \"zu\" [tag=\"ADJ.*\"] 349"               
## [14] "[word=\"[Ee]iner?\"] [word=\"[Ss]pur\"] [tag=\"N.*|ADJ.*\"] 11192"                
## [15] "[word=\"[Ee]iner?\"] [word=\"[Ss]pur\"] \"zu\" [tag=\"ADJ.*\"] 3442"

Read in data

We use the concordances package to read in the data.

# read data ---------------------------------------------------------------

fuenk <- getNSE("ein_em_Fuenkchen_ADJ_N.xml", xml = T, tags = T, context_tags = F, verbose = T)
fuenk_zu <- getNSE("ein_em_Fuenkchen_zu.xml", xml = T, tags = T, context_tags = F, verbose = T)
tack_zack <- getNSE("ein_enm_Tacken_Zacken_N_ADJ.xml", xml = T, context_tags = F)
tack_zack_zu <- getNSE("ein_enm_Tacken_Zacken_zu.xml", xml = T, context_tags = F)
handvoll <- getNSE("eine_r_Handvoll_ADJ_N.xml", xml = T, context_tags = F, tags = T)
idee <- getNSE("eine_r_Idee_ADJ_N.xml", xml = T, context_tags = F, tags = T)
idee_zu <- getNSE("eine_r_Idee_zu_ADJ.xml", xml = T, context_tags = F, tags = T)
tick <- getNSE("ein_enm_Tick_ADJ_N.xml", xml = T, context_tags = F, tags = T)
tick_zu <- getNSE("ein_enm_Tick_zu.xml", xml = T, context_tags = F, tags = T)
bisschen <- fread("ein_bisschen_adj_n_frequency_list.txt", col.names = c("Token", "Freq", "bla"))
hauch <- getNSE("ein_enm_Hauch_ADJ_N.xml", xml = T, context_tags = F, tags = T)
hauch_zu <- getNSE("ein_enm_Hauch_zu.xml", xml = T, context_tags = F, tags = T)
spur <- getNSE("eine_r_Spur_N_Adj.xml", xml = T, context_tags = F, tags = T)
spur_zu <- getNSE("eine_r_Spur_zu_ADJ.xml", xml = T, context_tags = F, tags = T)
quaentchen <- getNSE("ein_emn_Quäentchen_N_ADJ_V.xml", xml = T, context_tags = F, tags = T)
quaentchen_zu <- getNSE("ein_enm_Quäentchen_zu.xml", xml = T, context_tags = F, tags = T)

Data wrangling

We write and use a function to remove duplicates; we combine the concordances for “ein(e) X ADJ/N” and “ein(e) X zu ADJ”; and we add a lemma column to each concordance (using the automatic annotation).

# function for removing duplicates -----------
remove_duplicates <- function(df) {
  x <- which(duplicated(df$Left) && 
               duplicated(df$Key) &&
               duplicated(df$Right))
  
  if(length(x) > 0) {
    df <- df[-x,]
  }
  
  return(df)
  
}

# get modified nouns and adjectives in
# "bisschen" dataframe
bisschen[, Lemma := gsub("ein bisschen ", "", bisschen$Token)]

# remove duplicates
idee <- remove_duplicates(idee)
tick <- remove_duplicates(tick)
handvoll <- remove_duplicates(handvoll)
tack_zack <- remove_duplicates(tack_zack)
fuenk <- remove_duplicates(fuenk)
hauch <- remove_duplicates(hauch)
spur <- remove_duplicates(spur)
idee_zu <- remove_duplicates(idee_zu)
tick_zu <- remove_duplicates(tick_zu)
tack_zack_zu <- remove_duplicates(tack_zack_zu)
fuenk_zu <- remove_duplicates(fuenk_zu)
hauch_zu <- remove_duplicates(hauch_zu)
spur_zu <- remove_duplicates(spur_zu)
quaentchen <- remove_duplicates(quaentchen)
quaentchen_zu <- remove_duplicates(quaentchen_zu)

# combine "zu" and "normal" ones:
idee <- rbind(mutate(idee), cxn_type = "ADJ_N",
      mutate(idee_zu), cxn_type = "zu_ADJ")
spur <- rbind(mutate(spur), cxn_type = "ADJ_N",
              mutate(spur_zu), cxn_type = "zu_ADJ")
fuenk <- rbind(mutate(fuenk), cxn_type = "ADJ_N",
              mutate(fuenk_zu), cxn_type = "zu_ADJ")
spur <- rbind(mutate(spur), cxn_type = "ADJ_N",
              mutate(spur_zu), cxn_type = "zu_ADJ")
tack_zack <- rbind(mutate(tack_zack), cxn_type = "ADJ_N",
              mutate(tack_zack_zu), cxn_type = "zu_ADJ")
tick <- rbind(mutate(tick), cxn_type = "ADJ_N",
              mutate(tick_zu), cxn_type = "zu_ADJ")
hauch <- rbind(mutate(hauch), cxn_type = "ADJ_N",
              mutate(hauch_zu), cxn_type = "zu_ADJ")
quaentchen <- rbind(mutate(quaentchen), cxn_type = "ADJ_N",
                    mutate(quaentchen_zu), cxn_type = "zu_ADJ")


# add lemma column
idee$Lemma <- last_left(idee, Tag3_Key, 1)
tick$Lemma <- last_left(tick, Tag3_Key, 1)
fuenk$Lemma <- last_left(fuenk, Tag3_Key, 1)
tack_zack$Lemma <- last_left(tack_zack, Tag3_Key, 1)
handvoll$Lemma <- last_left(handvoll, Tag3_Key, 1)
spur$Lemma <- last_left(spur, Tag3_Key, 1)
hauch$Lemma <- last_left(hauch, Tag3_Key, 1)
quaentchen$Lemma <- last_left(quaentchen, Tag3_Key, 1)

In the case of Idee, and to a lesser extent in the case of Hauch and Spur, there are still many false hits, so we export it for annotation…

# write_csv(idee, "idee_for_anno.csv")

# Hauch: add last_left of keyword
# hauch$Key_modified <- last_left(hauch$Key, n = 1, omit_punctuation = FALSE)

# spur$Key_modified <- last_left(spur$Key, n = 1, omit_punctuation = FALSE)

# write_csv(hauch, "hauch_for_anno.csv")
# write_csv(spur, "spur_for_anno.csv")

We re-import the annotated datafiles:

# import data
idee <- read_xlsx("idee_for_anno.xlsx")
hauch <- read_xlsx("hauch_for_anno.xlsx")
spur <- read_xlsx("spur_for_anno.xlsx")

# remove false hits
idee <- filter(idee, keep == "y")
hauch <- filter(hauch, Modifier == "y")
spur <- filter(spur, Modifier == "y")

As an intermediate step, we create a large dataframe containing all attestations together with more information about their source, taken from the DECOW document list.

# combine all:
d_all <- rbind(select(fuenk, c("Metatag1", "Left", "Key", "Right")),
      select(handvoll, c("Metatag1", "Left", "Key", "Right")),
      select(hauch, c("Metatag1", "Left", "Key", "Right")),
      select(idee, c("Metatag1", "Left", "Key", "Right")),
      select(quaentchen, c("Metatag1", "Left", "Key", "Right")),
      select(spur, c("Metatag1", "Left", "Key", "Right")),
      select(tack_zack, c("Metatag1", "Left", "Key", "Right")),
      select(tick, c("Metatag1", "Left", "Key", "Right")))

# list of DECOW documents
decowdoc <- vroom("/Volumes/My Passport/DECOW16BX-Corex/decow16b.doc.csv.gz", col_names = paste0("V", c(1:85)))

# only keep relevant columns
decowdoc <- decowdoc[,c(1:4)]

# join with d_all
d_all <- left_join(d_all, decowdoc, by = c("Metatag1" = "V4"))

# export
write_excel_csv(d_all, "d_all.csv")
# re-import
d_all <- read_csv("d_all.csv")
## Rows: 104480 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Metatag1, Left, Key, Right, V1, V2, V3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The full list is available here.

A cursory glance at the instances where the modified item is a verb shows that most if not all of them are false hits, hence we will exclude them from further analysis.

fuenk <- fuenk[grep("^V.*", last_left(fuenk$Tag2_Key, n = 1), invert = T),]
hauch <- hauch[grep("^V.*", last_left(hauch$Tag2_Key, n = 1), invert = T),]
tick <- tick[grep("^V.*", last_left(tick$Tag2_Key, n = 1), invert = T),]
quaentchen <- quaentchen[grep("^V.*", last_left(quaentchen$Tag2_Key, n = 1), invert = T),]
tack_zack <- tack_zack[grep("^V.*", last_left(tack_zack$Tag2_Key, n = 1), invert = T),]
tick <- tick[grep("^V.*", last_left(tick$Tag2_Key, n = 1), invert = T),]

Summary statistics

How often do the individual constructions combine with nouns and adjectives etc.?

# function for getting the distribution:
get_distro <- function(vec) {
  x <- gsub("(?<=.).*", "", last_left(trimws(vec), n = 1), perl = T) %>% table
  y <- x[which(names(x) %in% c("A", "N", "V"))]
  y <- c(y, "other" = sum(x[which(!names(x) %in% c("A", "N", "V"))]))
  return(y)
}

# function for finding comparatives:
get_compar <- function(df) {
  # find comparatives
  find_comparatives <- which(grepl("ADJ.*", last_left(df$Tag2_Key, n = 1)) &
grepl("er(e|es|en)?$", trimws(df$Key)))
  
  # add to df
  df$comparative <- sapply(1:nrow(df), function(i) ifelse(i %in% find_comparatives, "yes", "no"))
  
  return(table(df$comparative))
  
  
}

# get "zu ADJ"
get_zu <- function(df) {
  return(length(which(sapply(1:nrow(df), function(i) unlist(strsplit(df$Key[i], " "))[3])=="zu")))
}


# get POS distributions
get_distro(fuenk$Tag2_Key) %>% as.data.frame %>% t()
##     A    N other
## . 156 2713     3
distro <- bind_rows(
  get_distro(fuenk$Tag2_Key),
get_distro(handvoll$Tag2_Key),
get_distro(idee$Tag2_Key),
get_distro(hauch$Tag2_Key),
get_distro(quaentchen$Tag2_Key),
get_distro(spur$Tag2_Key),
get_distro(tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),]$Tag2_Key),
get_distro(tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),]$Tag2_Key),
get_distro(tick$Tag2_Key)
) %>% as_tibble %>% mutate(Cxn = c("Fünkchen", "Handvoll", "Idee", "Hauch", "Quäntchen", "Spur", "Tacken", "Zacken", "Tick")) %>% replace_na(list(A = 0, N = 0, V = 0))

# get comparative distributions
distro <- mutate(distro, comparatives = c(
  get_compar(fuenk)[2],
get_compar(handvoll)[2],
get_compar(idee)[2],
get_compar(hauch)[2],
get_compar(quaentchen)[2],
get_compar(spur)[2],
get_compar(tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),])[2],
get_compar(tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),])[2],
get_compar(tick)[2]
)) %>% replace_na(list(comparatives = 0))


# zu...
distro <- mutate(distro, zu = c(
  get_zu(fuenk),
get_zu(handvoll),
get_zu(idee),
get_zu(hauch),
get_zu(quaentchen),
get_zu(spur),
get_zu(tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),]),
get_zu(tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),]),
get_zu(tick)
))

# column with comparatives and "zu" in ADJ column
distro$ADJ <- paste0(distro$A, " (", distro$comparatives, "/", distro$zu, ")")
distro <- rename(distro, c("ADJ (comparative / excessive)" = "ADJ"))

# add column with sum total
distro$sum <- distro$A + distro$N  + distro$other

# reorder columns
distro[,c(4,2,6,3,7)] %>% datatable()

We use the list of lemmas attested in the concordances to extract their total frequency in the DECOW corpus from the DECOW lemma frequency list.

# list of all lemmas across dfs
lemmas_all <- c(idee$Lemma, tick$Lemma, fuenk$Lemma, tack_zack$Lemma,
  handvoll$Lemma, bisschen$Lemma, spur$Lemma, hauch$Lemma, 
  quaentchen$Lemma) %>% unique


# collostructional analyses -----------------------------------------------
# 
# read DECOW lemma frequencies
decow <- fread("/Volumes/TOSHIBA EXT/DECOW ngrams/decow16bx.lp.tsv")

# only keep verbs, nouns and adjectives
decow01 <- decow[V2 %in% c("NN", "ADJD", "ADJA", "VAINF", "VVFIN", "VVINF", "VAPP", "VVPP", "VVIZU", "VAIMP")]
colnames(decow01) <- c("lemma", "pos", "Freq")

# count POS
pos_tbl <- decow01 %>% group_by(pos) %>% summarise(
  Freq = sum(Freq)
)

# only keep lemmas attested in the constructions
decow <- decow01[lemma %in% lemmas_all]

# export 
# saveRDS(decow, "decow_modifier_lemmas.Rds")
#saveRDS(pos_tbl, "pos_tbl.Rds")
# re-import
decow <- readRDS("decow_modifier_lemmas.Rds")
pos_tbl <- readRDS("pos_tbl.Rds")

Some of the lemmas in the decow dataframe occur more than once (e.g. because they have multiple POS tags), so we have to sum them up first. Also, the idee dataframe still contains many false hits, so we limit it to its most frequent domain by far, comparatives.

# sum up frequencies of lemmas occuring more than once
decow_sum <- decow %>% group_by(lemma) %>% summarise(
  Freq = sum(Freq)
)

Collostructional analysis

We have to do some more data wrangling in order to create the input dataframes for collostructional analysis.

# frequency tables for the different constructions
idee_tbl <- idee %>% select(Lemma) %>% table %>% as.data.frame
fuenk_tbl <- fuenk %>% select(Lemma) %>% table %>% as.data.frame
handvoll_tbl <- handvoll %>%  select(Lemma) %>% table %>% as.data.frame
tick_tbl <- tick %>%  select(Lemma) %>% table %>% as.data.frame
tack_tbl <- tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),] %>% 
  select(Lemma) %>% table %>% as.data.frame
zack_tbl <- tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),] %>% 
  select(Lemma) %>% table %>% as.data.frame
hauch_tbl <- hauch %>%  select(Lemma) %>% table %>% as.data.frame
spur_tbl <- spur %>%  select(Lemma) %>% table %>% as.data.frame
quaentchen_tbl <- quaentchen %>% select(Lemma) %>% table %>% as.data.frame()

colnames(idee_tbl) <- colnames(fuenk_tbl) <- 
  colnames(handvoll_tbl) <- colnames(tack_tbl) <- 
  colnames(zack_tbl) <- colnames(tick_tbl) <-  
  colnames(spur_tbl) <- colnames(hauch_tbl) <-
  colnames(quaentchen_tbl) <- 
  c("lemma", "Freq_mod")

bisschen_tbl <- bisschen %>% group_by(Lemma) %>% summarise(
  Freq_bisschen = sum(Freq)
)
colnames(bisschen_tbl) <- c("lemma", "Freq_bisschen")

# join dataframes
idee_tbl <- left_join(idee_tbl, decow_sum)
fuenk_tbl <- left_join(fuenk_tbl, decow_sum)
handvoll_tbl <- left_join(handvoll_tbl, decow_sum)
tack_tbl <- left_join(tack_tbl, decow_sum)
tick_tbl <- left_join(tick_tbl, decow_sum)
zack_tbl <- left_join(zack_tbl, decow_sum)
spur_tbl <- left_join(spur_tbl, decow_sum)
hauch_tbl <- left_join(hauch_tbl, decow_sum)
quaentchen_tbl <- left_join(quaentchen_tbl, decow_sum)
bisschen_tbl <- left_join(bisschen_tbl, decow_sum)

# replace NAs by 0
idee_tbl <- replace_na(idee_tbl, list(Freq_mod = 0, Freq = 0))
fuenk_tbl <- replace_na(fuenk_tbl, list(Freq_mod = 0, Freq = 0))
handvoll_tbl <- replace_na(handvoll_tbl, list(Freq_mod = 0, Freq = 0))
tack_tbl <- replace_na(tack_tbl, list(Freq_mod = 0, Freq = 0))
tick_tbl <- replace_na(tick_tbl, list(Freq_mod = 0, Freq = 0))
zack_tbl <- replace_na(zack_tbl, list(Freq_mod = 0, Freq = 0))
hauch_tbl <- replace_na(hauch_tbl, list(Freq_mod = 0, Freq = 0))
spur_tbl <- replace_na(spur_tbl, list(Freq_mod = 0, Freq = 0))
quaentchen_tbl <- replace_na(quaentchen_tbl, list(Freq_mod = 0, Freq = 0))
bisschen_tbl <- replace_na(bisschen_tbl, list(Freq_bisschen = 0, Freq = 0))

# reomove cases where cxn frequency is bigger than
# corpus frequency
idee_tbl <- idee_tbl[which(idee_tbl$Freq_mod <= idee_tbl$Freq),]
fuenk_tbl <- fuenk_tbl[which(fuenk_tbl$Freq_mod <= fuenk_tbl$Freq),]
handvoll_tbl <- handvoll_tbl[which(handvoll_tbl$Freq_mod <= handvoll_tbl$Freq),]
tack_tbl <- tack_tbl[which(tack_tbl$Freq_mod <= tack_tbl$Freq),]
tick_tbl <- tick_tbl[which(tick_tbl$Freq_mod <= tick_tbl$Freq),]
zack_tbl <- zack_tbl[which(zack_tbl$Freq_mod <= zack_tbl$Freq),]
spur_tbl <- spur_tbl[which(spur_tbl$Freq_mod <= spur_tbl$Freq),]
hauch_tbl <- hauch_tbl[which(hauch_tbl$Freq_mod <= hauch_tbl$Freq),]
quaentchen_tbl <- quaentchen_tbl[which(quaentchen_tbl$Freq_mod <= quaentchen_tbl$Freq),]
bisschen_tbl <- bisschen_tbl[which(bisschen_tbl$Freq_bisschen <= bisschen_tbl$Freq),]


# collexeme analysis ------------------------------------------------------

col_idee <- collex(idee_tbl,
       corpsize = 
         sum(pos_tbl[grep("ADJ.*", pos_tbl$pos),]$Freq))# %>%  write_excel_csv("idee_collex.csv")

col_fuenk <- collex(fuenk_tbl,
       corpsize = sum(pos_tbl$Freq)) # %>% write_excel_csv("fuenkchen_collex.csv")

col_handvoll <- collex(handvoll_tbl,
       corpsize = sum(pos_tbl$Freq)) # %>% write_csv("handvoll_collex.csv")

col_tack <- collex(tack_tbl, 
       corpsize = sum(pos_tbl$Freq)) # %>% write_csv("tack_collex.csv")

col_tick <- collex(tick_tbl, 
                   corpsize = sum(pos_tbl$Freq)) # %>% write_csv("tick_collex.csv")

col_zack <- collex(zack_tbl, 
       corpsize = sum(pos_tbl$Freq)) # %>% write_csv("zack_collex.csv")

col_spur <- collex(spur_tbl, 
                   corpsize = sum(pos_tbl$Freq)) # %>% write_csv("spur_collex.csv")


col_hauch <- collex(hauch_tbl, 
                   corpsize = sum(pos_tbl$Freq)) # %>% write_csv("hauch_collex.csv")

col_quaentchen <- collex(quaentchen_tbl, 
                    corpsize = sum(pos_tbl$Freq)) # %>% write_csv("quaentchen_collex.csv")


col_bisschen <- collex(bisschen_tbl, 
                   corpsize = sum(pos_tbl$Freq)) # %>% write_csv("bisschen_collex.csv")
## Warning in if (class(x) == "list") {: the condition has length > 1 and only the
## first element will be used

Collostructional analysis: Results

Here are the results of the collostructional analyses (in alphabetical order).

ein bisschen

ein Hauch

eine Spur

ein Zacken

ein Tick

ein Tacken

eine Handvoll

ein Fünkchen

eine Idee

ein Quäntchen

Network analysis

# first links, then edges

links <- rbind(
  col_idee %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "eine Idee") ,
  col_handvoll %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "eine Handvoll") ,
  col_fuenk %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Fünkchen") ,
  col_tack %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Tacken"),
  col_tick %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Tick"),
  col_zack %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Zacken"),
  col_hauch %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Hauch"),
  col_spur %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "eine Spur"),
  col_quaentchen %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Quäntchen"),
  col_bisschen %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein bisschen") ) %>%
  mutate(edge_type = LEX) %>%
  group_by(LEX) %>%
  slice(1:100) %>%
  ungroup()

# reorder columns
links <- links[,c(3,1,2,4)] %>% 
  arrange(edge_type)

# create dataframes for links and nodes
nodes_LEX = data.frame(links$LEX) %>%
  distinct() %>%
  rename(name = links.LEX) %>%
  mutate(node_type = name) %>%
  mutate(node_size = 10) %>%
  mutate(text_size = 100) %>%
  mutate(text_fontface = "bold") %>%
  mutate(shape = "circle") %>%
  mutate(label = name) 
nodes_COLLEX = data.frame(links$COLLEX) %>%
  distinct() %>%
  rename(name = links.COLLEX) %>%
  mutate(node_type = "COLLEX") %>%
  mutate(node_size = 1.5) %>%
  mutate(text_size = 1) %>%
  mutate(text_fontface = "plain") %>%
  mutate(label = NA) 
nodes_all = bind_rows(nodes_LEX, nodes_COLLEX) %>% 
  arrange(node_type)

# plot
col_graph <- graph_from_data_frame(links, nodes_all, directed = F)

# set.seed(1985)
set.seed(1995)
# used "kk" layout because it is less spread out
ggraph(col_graph, layout = "kk") +
  geom_edge_link(aes(color = edge_type), show.legend = FALSE,
                 end_cap = circle(.07, 'inches')) +
  scale_edge_color_manual(values = c("#FF0000", "#A7D547", "#FFA500", "#00FFFF", 
                                     "#FF00FF", "#00BFFF", "#008000",  "#CDAD5A", "#00FF00", "#AD7A44")) +
  geom_node_point(aes(color = node_type, size = node_size), show.legend = FALSE) +
  scale_color_manual(values = c("#000000", "#FF0000", "#A7D547", "#FFA500", "#00FFFF", 
                                "#FF00FF", "#00BFFF", "#008000",  "#CDAD5A", "#00FF00", "#AD7A44")) +
  geom_node_text(aes(label = label, size = text_size, fontface = text_fontface), vjust = 1, hjust = 1, show.legend = FALSE) +
  theme_void()
## Warning: Removed 716 rows containing missing values (geom_text).

# decreased width and height so the font size would come out as bigger (or is there a better way?)
# ggsave("network_modifiers_100_kk.png", width = 15, height = 10)


# plot only with selected nodes

# select modifiers
# reorder columns
links2 <- links %>% filter(LEX %in% c("ein bisschen", "ein Tick", 
                                      "eine Idee", "ein Quäntchen") & 
                             edge_type %in% c("ein bisschen", "ein Tick", 
                                             "eine Idee", "ein Quäntchen"))

# create dataframes for links and nodes
nodes_LEX2 = data.frame(links2$LEX) %>%
  distinct() %>%
  rename(name = links2.LEX) %>%
  mutate(node_type = name) %>%
  mutate(node_size = 2) %>%
  mutate(text_size = 4) %>%
  mutate(text_fontface = "bold") 
nodes_COLLEX2 = data.frame(links2$COLLEX) %>%
  distinct() %>%
  rename(name = links2.COLLEX) %>%
  mutate(node_type = "COLLEX") %>%
  mutate(node_size = 1.5) %>%
  mutate(text_size = 2.5) %>%
  mutate(text_fontface = "plain") 
nodes_all2 <- bind_rows(nodes_LEX2, nodes_COLLEX2) %>% 
  arrange(node_type)

# plot
col_graph2 <- graph_from_data_frame(links2, nodes_all2, directed = F)

# plot
set.seed(1995)
ggraph(col_graph2, layout = "fr") +
  geom_edge_link(aes(color = edge_type), show.legend = FALSE,
                 end_cap = circle(.07, 'inches')) +
  scale_edge_color_manual(values = c("#FF0000", "#00FF00", "#FFA500", "#00FFFF", 
                                     "#FF00FF", "#00BFFF", "#008000", "#007000", "#006000")) +
  geom_node_point(aes(color = node_type, size = node_size), show.legend = FALSE) +
  scale_color_manual(values = c("#000000", "#FF0000", "#00FF00", "#FFA500", "#00FFFF", 
                                "#FF00FF", "#00BFFF", "#008000", "#007000", "#006000")) +
  geom_node_text(aes(label = name, size = text_size, fontface = text_fontface), vjust = 1, hjust = 1, show.legend = FALSE) +
  theme_void()

# ggsave("network_modifiers001.png", width = 15, height = 25)
# ggsave("network_selection002.png", width = 40, height = 20)



# Playing around a bit more (Tobias) ----------------------------------------

modifiers = c("eine Idee", "eine Handvoll", "ein Fünkchen", "ein Tacken", "ein Tick", "ein Zacken",
              "ein Hauch", "eine Spur", "ein Quäntchen", "ein bisschen")

# plot with layout "kk"
ggraph(col_graph, layout = "kk") +
  geom_edge_link(aes(color = edge_type), show.legend = FALSE,
                 end_cap = circle(.07, 'inches')) +
  scale_edge_color_manual(values = c("#FF0000", "#A7D547", "#FFA500", "#00FFFF", 
                                     "#FF00FF", "#00BFFF", "#008000",  "#CDAD5A", "#00FF00", "#AD7A44")) +
  geom_node_point(aes(color = node_type, size = node_size), show.legend = FALSE) +
  scale_color_manual(values = c("#000000", "#FF0000", "#A7D547", "#FFA500", "#00FFFF", 
                                "#FF00FF", "#00BFFF", "#008000",  "#CDAD5A", "#00FF00", "#AD7A44")) +
  geom_node_text(aes(label = name, size = text_size, fontface = text_fontface), vjust = 1, hjust = 1, show.legend = FALSE) +
  theme_void()

# ggsave("network_modifiers_kk.png", width = 40, height = 20)

# plot with edge length determined by COLLSTR (logged to decrease the differences; not sure why the inverse was needed but otherwise it was the wrong way round)
ggraph(col_graph, layout = "kk", weights = 1/log(COLL.STR.LOGL)) +
  geom_edge_link(aes(color = edge_type), show.legend = FALSE,
                 end_cap = circle(.07, 'inches')) +
  scale_edge_color_manual(values = c("#FF0000", "#A7D547", "#FFA500", "#00FFFF", 
                                     "#FF00FF", "#00BFFF", "#008000",  "#CDAD5A", "#00FF00", "#AD7A44")) +
  geom_node_point(aes(color = node_type, size = node_size), show.legend = FALSE) +
  scale_color_manual(values = c("#000000", "#FF0000", "#A7D547", "#FFA500", "#00FFFF", 
                                "#FF00FF", "#00BFFF", "#008000",  "#CDAD5A", "#00FF00", "#AD7A44")) +
  geom_node_text(aes(label = name, size = text_size, fontface = text_fontface), vjust = 1, hjust = 1, show.legend = FALSE) +
  theme_void()

# ggsave("network_modifiers_kk_COLLSTR_as_edge_length.png", width = 40, height = 20)

# betweenness centrality as a measure of how 'important' the modifier nodes are for the network, i.e. how many shortest paths pass through each modifier node
betweenness_centrality = betweenness(col_graph, v = modifiers, 
                                     directed = TRUE, weights = NULL, nobigint = TRUE)
betweenness_centrality
##     eine Idee eine Handvoll  ein Fünkchen    ein Tacken      ein Tick 
##      43522.73      65631.07      51563.98      37139.24      31318.06 
##    ein Zacken     ein Hauch     eine Spur ein Quäntchen  ein bisschen 
##      42380.18      71252.07      43060.58      54532.54      85043.55
# number of collexemes that each modifier shares with each other modifier
# note that the same collexeme can be counted several times for different modifier combinations
# first, restrict to collexemes that are linked to at least two modifiers
links_reduced <- links %>%
  group_by(COLLEX) %>%
  filter(n()>1)
# then loop over each modifier and determine how many collexemes it shares with each other modifier
for (word in modifiers) {
  target = word
  links_target = links_reduced %>%
    filter(LEX == target)
  collexeme_neighbours = links_reduced %>%
    filter(COLLEX %in% links_target$COLLEX) %>% 
    filter(LEX != target) %>%
    group_by(LEX) %>%
    count() 
  
  collexeme_neighbours_sum = sum(collexeme_neighbours$n)
  collexeme_neighbours_sd_normalised = round((sd(collexeme_neighbours$n)/sum(collexeme_neighbours$n)), 3)
  # or should sd be divided by sum(collexeme_neighbours$n)?
  # to account for the fact that some modifiers have more shared collexemes than others
  
  print(word)
  # table of no. of shared collexemes for each modifier
  print(collexeme_neighbours)
  # sum of no. of shared collexemes for each modifier
  print(collexeme_neighbours_sum)
  # sd of no. of shared collexemes for each modifier
  print(collexeme_neighbours_sd_normalised)
}
## [1] "eine Idee"
## # A tibble: 8 × 2
## # Groups:   LEX [8]
##   LEX               n
##   <chr>         <int>
## 1 ein bisschen      4
## 2 ein Fünkchen      1
## 3 ein Hauch        10
## 4 ein Quäntchen     3
## 5 ein Tacken       32
## 6 ein Tick         37
## 7 ein Zacken       18
## 8 eine Spur        34
## [1] 139
## [1] 0.108
## [1] "eine Handvoll"
## # A tibble: 2 × 2
## # Groups:   LEX [2]
##   LEX              n
##   <chr>        <int>
## 1 ein bisschen     2
## 2 ein Hauch        1
## [1] 3
## [1] 0.236
## [1] "ein Fünkchen"
## # A tibble: 5 × 2
## # Groups:   LEX [5]
##   LEX               n
##   <chr>         <int>
## 1 ein bisschen      9
## 2 ein Hauch         6
## 3 ein Quäntchen    27
## 4 eine Idee         1
## 5 eine Spur         4
## [1] 47
## [1] 0.218
## [1] "ein Tacken"
## # A tibble: 7 × 2
## # Groups:   LEX [7]
##   LEX               n
##   <chr>         <int>
## 1 ein bisschen      9
## 2 ein Hauch         8
## 3 ein Quäntchen     4
## 4 ein Tick         49
## 5 ein Zacken       30
## 6 eine Idee        32
## 7 eine Spur        32
## [1] 164
## [1] 0.102
## [1] "ein Tick"
## # A tibble: 7 × 2
## # Groups:   LEX [7]
##   LEX               n
##   <chr>         <int>
## 1 ein bisschen      7
## 2 ein Hauch         9
## 3 ein Quäntchen     3
## 4 ein Tacken       49
## 5 ein Zacken       36
## 6 eine Idee        37
## 7 eine Spur        44
## [1] 185
## [1] 0.105
## [1] "ein Zacken"
## # A tibble: 7 × 2
## # Groups:   LEX [7]
##   LEX               n
##   <chr>         <int>
## 1 ein bisschen      5
## 2 ein Hauch         3
## 3 ein Quäntchen     3
## 4 ein Tacken       30
## 5 ein Tick         36
## 6 eine Idee        18
## 7 eine Spur        27
## [1] 122
## [1] 0.114
## [1] "ein Hauch"
## # A tibble: 9 × 2
## # Groups:   LEX [9]
##   LEX               n
##   <chr>         <int>
## 1 ein bisschen      4
## 2 ein Fünkchen      6
## 3 ein Quäntchen    13
## 4 ein Tacken        8
## 5 ein Tick          9
## 6 ein Zacken        3
## 7 eine Handvoll     1
## 8 eine Idee        10
## 9 eine Spur        21
## [1] 75
## [1] 0.081
## [1] "eine Spur"
## # A tibble: 8 × 2
## # Groups:   LEX [8]
##   LEX               n
##   <chr>         <int>
## 1 ein bisschen      7
## 2 ein Fünkchen      4
## 3 ein Hauch        21
## 4 ein Quäntchen     9
## 5 ein Tacken       32
## 6 ein Tick         44
## 7 ein Zacken       27
## 8 eine Idee        34
## [1] 178
## [1] 0.081
## [1] "ein Quäntchen"
## # A tibble: 8 × 2
## # Groups:   LEX [8]
##   LEX              n
##   <chr>        <int>
## 1 ein bisschen    10
## 2 ein Fünkchen    27
## 3 ein Hauch       13
## 4 ein Tacken       4
## 5 ein Tick         3
## 6 ein Zacken       3
## 7 eine Idee        3
## 8 eine Spur        9
## [1] 72
## [1] 0.114
## [1] "ein bisschen"
## # A tibble: 9 × 2
## # Groups:   LEX [9]
##   LEX               n
##   <chr>         <int>
## 1 ein Fünkchen      9
## 2 ein Hauch         4
## 3 ein Quäntchen    10
## 4 ein Tacken        9
## 5 ein Tick          7
## 6 ein Zacken        5
## 7 eine Handvoll     2
## 8 eine Idee         4
## 9 eine Spur         7
## [1] 57
## [1] 0.048
## EXTENSION: store the collexeme counts from the loop as a dataframe, and then visualise them as a heat map

# plot only of shared collexemes
# create reduced dataframes for links and nodes
nodes_reduced_LEX = data.frame(links_reduced$LEX) %>%
  distinct() %>%
  rename(name = links_reduced.LEX) %>%
  mutate(node_type = name) %>%
  mutate(node_size = 3) %>%
  mutate(text_size = 20) %>%
  mutate(text_fontface = "bold") 
nodes_reduced_COLLEX = data.frame(links_reduced$COLLEX) %>%
  distinct() %>%
  rename(name = links_reduced.COLLEX) %>%
  mutate(node_type = "COLLEX") %>%
  mutate(node_size = 2) %>%
  mutate(text_size = 15) %>%
  mutate(text_fontface = "plain") 
nodes_reduced_all = bind_rows(nodes_reduced_LEX, nodes_reduced_COLLEX) %>% 
  arrange(node_type)

# plot
col_graph_reduced <- graph_from_data_frame(links_reduced, nodes_reduced_all, directed = F)

set.seed(1995)
ggraph(col_graph_reduced, layout = "kk") +
  geom_edge_link(aes(color = edge_type), show.legend = FALSE,
                 end_cap = circle(.07, 'inches')) +
  scale_edge_color_manual(values = c("#FF0000", "#00FF00", "#FFA500", "#00FFFF", 
                                     "#FF00FF", "#00BFFF", "#008000", "#6a5acd", "#ffa500", "#ee82ee")) +
  geom_node_point(aes(color = node_type, size = node_size), show.legend = FALSE) +
  scale_color_manual(values = c("#000000", "#FF0000", "#00FF00", "#FFA500", "#00FFFF", 
                                "#FF00FF", "#00BFFF", "#008000",  "#6a5acd", "#ffa500", "#ee82ee")) +
  geom_node_text(aes(label = name, size = text_size, fontface = text_fontface), vjust = 1, hjust = 1, show.legend = FALSE) +
  theme_void()

# ggsave("network_modifiers_shared_collexemes_100kk.png", width = 25, height = 15)
# ggsave("network_modifiers_shared_collexemes_100kk.png", width = 40, height = 20)

Number of shared collexemes

While the collexeme analyses give an impression of the semantics of each construction, another interesting question is how many of the collexemes are shared between the individual constructions. In order to assess this question, we create a word-construction-matrix and visualize it using a heatmap.

# TODO: add network before
# TODO: limit to those displayed in the network

# create long list of lemmas and cxns
lemmas_df <- rbind(
  mutate(select(fuenk, Lemma), cxn = "Fünkchen"),
  mutate(select(hauch, Lemma), cxn = "Hauch"),
  mutate(select(idee, Lemma), cxn = "Idee"),
  mutate(select(quaentchen, Lemma), cxn = "Quäntchen"),
  mutate(select(spur, Lemma), cxn = "Spur"),
  mutate(select(tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),], Lemma), cxn = "Tacken"),
  mutate(select(tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),], Lemma), cxn = "Zacken"),
  mutate(select(tick, Lemma), cxn = "Tick")
)

# find out how many items are shared between each 2 cxns
cxns <- unique(lemmas_df$cxn)

# empty dataframe
mydf <- data.frame(matrix(NA, nrow = 8, ncol = 8))
colnames(mydf) <- cxns
rownames(mydf) <- cxns

for(i in 1:length(cxns)) {
  for(j in 1:length(cxns)) {
    mydf[j,i] <- length(intersect(unique(filter(lemmas_df, cxn == colnames(mydf)[i])$Lemma),
          unique(filter(lemmas_df, cxn == rownames(mydf)[j])$Lemma)))
    
  }
}


# change to long dataframe
mydf2 <- rownames_to_column(mydf)
mydf_long <- pivot_longer(mydf2, 2:length(mydf2))
colnames(mydf_long) <- c("cxn_A", "cxn_B", "n")

# get a more reliable impression of the number of shared items by dividing the frequency value by the total number of types of the less frequent construction

mytypes <- lemmas_df %>% group_by(cxn) %>% summarise(
  types = length(unique(Lemma))
)

# add n of types for cxn A and B
mydf_long <- left_join(mydf_long, mytypes, by = c("cxn_A" = "cxn"))
mydf_long <- rename(mydf_long, types_A = types)
mydf_long <- left_join(mydf_long, mytypes, by = c("cxn_B" = "cxn"))
mydf_long <- rename(mydf_long, types_B = types)

# get smaller n
mydf_long$n_min_types <- ifelse(mydf_long$types_A < mydf_long$types_B, mydf_long$types_A, mydf_long$types_B)

# "relative" frequency
mydf_long$rel <- mydf_long$n / mydf_long$n_min_types


# heatmap
mydf_long %>% filter(rel < 1) %>% ggplot(aes(x = cxn_A, y = cxn_B, fill = rel, label = n)) + geom_tile() + geom_text() + scale_fill_gradient(low = "yellow", high = "darkred") +
  xlab("Construction") + ylab("Construction") + guides(fill = guide_legend(title = "relative overlap")) + theme_classic() + theme(axis.text.x = element_text(angle=45, hjust=.9))

mydf_long %>% filter(rel < 1) %>% ggplot(aes(x = cxn_A, y = cxn_B, fill = n, label = n)) + geom_tile() + geom_text() + scale_fill_gradient(low = "yellow", high = "darkred") +
  xlab("Construction") + ylab("Construction") + guides(fill = guide_legend(title = "overlap")) + theme_classic() + theme(axis.text.x = element_text(angle=45, hjust=.9))

The same limited to the top 100 collexemes for each construction:

# top 100 collexemes:

top100_collexemes <- tibble(
  "Fünkchen" = col_fuenk %>% select(COLLEX) %>% head(100) %>% unname,
  "Handvoll" = col_handvoll %>% select(COLLEX) %>% head(100) %>% unname,
  "Hauch" =  col_hauch %>% select(COLLEX) %>% head(100) %>% unname,
  "Idee" = col_idee %>% select(COLLEX) %>% head(100) %>% unname,
  "Quäntchen" = col_quaentchen %>% select(COLLEX) %>% head(100) %>% unname,
  "Spur" = col_spur %>% select(COLLEX) %>% head(100) %>% unname,
  "Tacken" = col_tack %>% select(COLLEX) %>% head(100) %>% unname,
  "Tick" = col_tick %>% select(COLLEX) %>% head(100) %>% unname,
  "Zacken" = col_zack %>% select(COLLEX) %>% head(100) %>% unname
)

# remove "$" from column names
colnames(top100_collexemes) <- gsub("[[:punct:]]", "", colnames(top100_collexemes))


# lemmas_df only with top 100 lemmas
# create long list of lemmas and cxns


lemmas_df <- rbind(
  mutate(select(filter(fuenk, Lemma %in% unlist(top100_collexemes$`Fünkchen`)), Lemma), cxn = "Fünkchen"),
  mutate(select(filter(hauch, Lemma %in% unlist(top100_collexemes$Hauch)), Lemma), cxn = "Hauch"),
  mutate(select(filter(idee, Lemma %in% unlist(top100_collexemes$Idee)), Lemma), cxn = "Idee"),
  mutate(select(filter(quaentchen, Lemma %in% unlist(top100_collexemes$`Quäntchen`)), Lemma), cxn = "Quäntchen"),
  mutate(select(filter(spur, Lemma %in% unlist(top100_collexemes$Spur)), Lemma), cxn = "Spur"),
  mutate(select(filter(tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),], Lemma %in% unlist(top100_collexemes$Tacken)), Lemma), cxn = "Tacken"),
  mutate(select(filter(tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),], Lemma %in% unlist(top100_collexemes$Zacken)), Lemma), cxn = "Zacken"),
  mutate(select(filter(tick, Lemma %in% unlist(top100_collexemes$Tick)), Lemma), cxn = "Tick")
)


# The rest is copy&pasted from above as it remains unchanged:

# empty dataframe
mydf <- data.frame(matrix(NA, nrow = 8, ncol = 8))
colnames(mydf) <- cxns
rownames(mydf) <- cxns

for(i in 1:length(cxns)) {
  for(j in 1:length(cxns)) {
    mydf[j,i] <- length(intersect(filter(lemmas_df, cxn == colnames(mydf)[i])$Lemma,
          filter(lemmas_df, cxn == rownames(mydf)[j])$Lemma))
    
  }
}


# change to long dataframe
mydf2 <- rownames_to_column(mydf)
mydf_long <- pivot_longer(mydf2, 2:length(mydf2))
colnames(mydf_long) <- c("cxn_A", "cxn_B", "n")

# get a more reliable impression of the number of shared items by dividing the frequency value by the total number of types of the less frequent construction

mytypes <- lemmas_df %>% group_by(cxn) %>% summarise(
  types = length(unique(Lemma))
)

# add n of types for cxn A and B
mydf_long <- left_join(mydf_long, mytypes, by = c("cxn_A" = "cxn"))
mydf_long <- rename(mydf_long, types_A = types)
mydf_long <- left_join(mydf_long, mytypes, by = c("cxn_B" = "cxn"))
mydf_long <- rename(mydf_long, types_B = types)

# get smaller n
mydf_long$n_min_types <- ifelse(mydf_long$types_A < mydf_long$types_B, mydf_long$types_A, mydf_long$types_B)

# "relative" frequency
mydf_long$rel <- mydf_long$n / mydf_long$n_min_types


# heatmap
mydf_long %>% filter(rel < 1) %>% ggplot(aes(x = cxn_A, y = cxn_B, fill = rel, label = n)) + geom_tile() + geom_text() + scale_fill_gradient(low = "yellow", high = "darkred") +
  xlab("Construction") + ylab("Construction") + guides(fill = guide_legend(title = "relative overlap")) + theme_classic() + theme(axis.text.x = element_text(angle=45, hjust=.9))

mydf_long %>% filter(rel < 1) %>% ggplot(aes(x = cxn_A, y = cxn_B, fill = n, label = n)) + geom_tile() + geom_text() + scale_fill_gradient(low = "yellow", high = "darkred") +
  xlab("Construction") + ylab("Construction") + guides(fill = guide_legend(title = "overlap")) + theme_classic() + theme(axis.text.x = element_text(angle=45, hjust=.9))

References

  • Schäfer, Roland. 2015. Processing and querying large corpora with the COW14 architecture. In Piotr Bański, Hanno Biber, Evelyn Breiteneder, Marc Kupietz, Harald Lüngen & Andreas Witt (eds.), Challenges in the Management of Large Corpora (CMLC-3), 28–34.

  • Schäfer, Roland & Felix Bildhauer. 2012. Building Large Corpora from the Web Using a New Efficient Tool Chain. In Nicoletta Calzolari, Khalid Choukri, Terry Declerck, Mehmet Uğur Doğan, Bente Maegaard, Joseph Mariani, Asuncion Moreno, Jan Odijk & Stelios Piperidis (eds.), Proceedings of LREC 2012, 486–493.